library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggfortify)
library(colorRamps)
library(htmlTable)
#this function return simulated ICC data
get_data<-function(target, n, k, mu, sigmaw){
#create label for group
group <- factor(rep(1:k, each = n/k))
#create simulated y value for simulated target
sigmab<-sqrt(sigmaw^2*target/(1-target))
value <- c(sapply(rnorm(k,mu,sigmab), function(mui) rnorm(n/k, mui, sigmaw)))#observations
data <- data.frame(group = group, value = value)
return(data)
}
Here, a function was developed to calculate the purity when evaluating clustering algorithm: \[purity(\omega, c)=\frac{1}{N}\sum^{k}max_{j}|\omega_k \cap c_j|\]
#this function returns purity after applying clustering algorithms
purity<-function(clusters, classes) {
sum(apply(table(classes, clusters), 2, max)) / length(clusters)
}
Using the above two functions to evaluate the performance of k-means clustering
#set seed for simulations
set.seed(1117)
#get simulated data
dat<-replicate(50, get_data(target = 0.3, n = 100, k = 3, mu = 5, sigmaw = 0.1)$value)
dat<-as.data.frame(dat)
dat$group<-get_data(target = 0.3, n = 100, k = 3, mu = 5, sigmaw = 0.1)$group
#apply k-mean cluster analysis with k=5
fit<-kmeans(dat[,c(1:50)], 5)
# append cluster assignment
dat <- data.frame(dat, fit$cluster)
#calculate the rate of misclassified observations
rate<-1-purity(dat$fit.cluster, dat$group)
Based on the number of misclassification, we could conclude that when we applied 5-means clustering on a data with a real group of three, there are nearly 31.3% false rate.
b). Apply different ICC value from 0.1 to 0.9 and visualize the rate of purity against ICC by ggplot2:
#this function returns purity for measuring accuracy of k-mean clustering
get.purity<-function(target, n, k, kmeans, p){
#get simulated data
dat<-replicate(p, get_data(target, n, k, mu = 5, sigmaw = 0.1)$value)
dat<-as.data.frame(dat)
dat$group<- as.numeric(get_data(target, n, k, mu = 5, sigmaw = 0.1)$group)
#apply k-mean cluster analysis with k=5
set.seed(222)
fit<-kmeans(dat[,c(1:p)], kmeans)
#append cluster assignment
dat <- data.frame(dat, fit$cluster)
#calculate the rate of purity
return(purity(dat$fit.cluster, dat$group))
}
#get results for varing ICC
set.seed(24)
purity_result<-sapply(seq(0.1,0.9, 0.05), function(x) get.purity(target = x, n = 100, k = 3,
kmeans = 3, p = 50))
summary<-data.frame(ICC = seq(0.1,0.9, 0.05), purity_result)
#using ggplot2 to visualize the results
ggplot(data = summary, aes(x = ICC, y = purity_result))+
geom_line()+
geom_point(size=2)+
scale_y_continuous(name='Purity')+
geom_hline(yintercept = 1, linetype = 'dashed')+
expand_limits(y=0)+
theme_classic()
We found when ICC reaches to 0.2, there is perfect clustering after applying 3-means clustering on a simulated data with a group of 3.The required ICC for effective clustering would be at least 0.2.
c). Using minimum ICC as 0.2. The data with noise variables were generated as following:
get_noise_data<-function(x, p = 50, n = 100, k =3, ICC = 0.2){
if(x!= p && x!=0){
temp1<- as.data.frame(replicate(p-x, get_data(ICC, n, k, mu = 5, sigmaw =
0.1)$value))
temp2<- as.data.frame(replicate(x, get_data(target = 0, n, k, mu = 5,
sigmaw = 0.1)$value))
temp<- data.frame(group = get_data(ICC, n, k, mu = 5, sigmaw =
0.1)$group , temp1, temp2)}
#if all variables are noisy
if(x == p){
temp<-as.data.frame(replicate(p, get_data(target = 0, n, k, mu = 5, sigmaw
= 0.1)$value))
temp<-data.frame(group = get_data(target = 0, n, k, mu = 5, sigmaw =
0.1)$group, temp)
}
#none of the variables are noisy
if(x == 0){
temp<-as.data.frame(replicate(p, get_data(target = ICC, n, k, mu = 5,
sigmaw = 0.1)$value))
temp<-data.frame(group = get_data(target = ICC, n, k, mu = 5, sigmaw =
0.1)$group, temp)
}
names(temp)<-c("group", paste0("X", c(1:p)))
return(temp)
}
#number of noise variables t
get.purity.v2<-function(t, k, kmeans, n = 100, p =50, icc = 0.2){
dat<-get_noise_data(x = t, p, n, k, icc)
set.seed(222)
fit<-kmeans(dat[,c(2:(p+1))], kmeans)
#append cluster assignment
dat <- data.frame(dat, fit$cluster)
#calculate the rate of misclassification
return(purity(dat$fit.cluster, dat$group))
}
set.seed(47)
#summary the results for varying noise variables
sum<-sapply(seq(0, 50, 5), function(x) get.purity.v2(x, k = 3, kmeans = 3, 100, 50, 0.2))
#visualize the results by ggplot2
temp<-data.frame(Noisy = seq(0, 50, 5), Purity = sum)
ggplot(data = temp, aes(x = Noisy, y = Purity))+
geom_line()+
geom_point(size=2)+
scale_y_continuous(name ='Purity')+
scale_x_continuous(name ='Number of Noisy Variables')+
expand_limits(y=0)+
theme_classic()
Comments: when the number of noisy variables increases, the purity of k-means clustering is reduced especially when it’s more than the number of non-noisy variables.
d). when we control the number of noisy variables and target ICC:
set.seed(117)
ICC<-sample(seq(0.05, 0.95, 0.001),200)
noisy<-sample(seq(0, 50, 1), 200, replace = TRUE)
purity0<-mapply(function(x,y) get.purity.v2(x, 3, 3, 100, 50, y), noisy, ICC)
temp2<-data.frame(noisy, ICC, Purity = purity0)
#scale the variables of noisy variable and ICC value
temp2[,c(1:2)]<-scale(temp2[,c(1:2)])
#fit lm model to explore interaction
model<-lm(Purity~ noisy*ICC, data = temp2)
summary(model)
##
## Call:
## lm(formula = Purity ~ noisy * ICC, data = temp2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44892 -0.05557 0.00315 0.06334 0.19338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.892481 0.007197 124.011 < 2e-16 ***
## noisy -0.069218 0.007219 -9.588 < 2e-16 ***
## ICC 0.119077 0.007215 16.504 < 2e-16 ***
## noisy:ICC 0.050586 0.007267 6.961 4.97e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1016 on 196 degrees of freedom
## Multiple R-squared: 0.6668, Adjusted R-squared: 0.6617
## F-statistic: 130.7 on 3 and 196 DF, p-value: < 2.2e-16
#visualize the results
pp<-function(n){
set.seed(117)
ICC<-sample(seq(0.05, 0.95, 0.001), n)
Noisy<-sample(seq(0, 50, 1), n, replace = TRUE)
df<-expand.grid(x = ICC, y = Noisy)
df$Purity<-mapply(function(x,y) get.purity.v2(x, 3, 3, 100, 50, y), Noisy, ICC)
df
}
p <- ggplot(pp(200), aes(x=x,y=y))
p <- p + geom_tile(aes(fill=Purity))+
scale_x_continuous(name ='ICC')+
scale_y_continuous(name ='Number of Noisy Variables')+
scale_color_gradientn(colours=matlab.like(5))+
theme_classic()
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
Based on the ANOVA table, we could conclude that purity by k-means clustering is negatively correlated with the number of noisy variables and positively correlated with ICC value. In addition, we observed significant interaction between number of noisy variables. According to the plot, we found more low purity occurrs when ICC is low and the number of noisy variables is large.
2.PCA and clustering When ICC=0.5, it’s likely to uncover clusters. Biplot of first two PCs were shown as below:
From the scree plot, we find the first two PCs are meaningful.
b). When the k=4, based on the scree plot, we find the meaningful PCs are the first three. From the biplot, we found there were some overlapping points in group 2 and 4. The group separation by PC1 is not as good as the one when k=3. Thus, we conclude that when the number of “true group” increases, there is more need for meaningful PCs to recapture the orignial structure of the data.
#set working directory
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
##
## subplot
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(cluster)
library(fpc)
library(lattice)
setwd("C:/Users/bitga/Downloads/")
mice_dat<-read.xlsx("Data_Cortex_Nuclear.xls", 1, header = TRUE)
mice_dat$MouseID<-as.character(mice_dat$MouseID)
describe(mice_dat[,c(2:78)])
## mice_dat[, c(2:78)]
##
## 77 Variables 1080 Observations
## ---------------------------------------------------------------------------
## DYRK1A_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.4258 0.2083 0.2241 0.2486
## .25 .50 .75 .90 .95
## 0.2881 0.3664 0.4877 0.6447 0.7693
##
## lowest : 0.1453265 0.1568492 0.1633245 0.1643295 0.1684934
## highest: 2.2018314 2.3358022 2.4599481 2.4803157 2.5163674
## ---------------------------------------------------------------------------
## ITSN1_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1076 1 0.6171 0.2276 0.3717 0.4010
## .25 .50 .75 .90 .95
## 0.4734 0.5658 0.6980 0.8631 0.9899
##
## lowest : 0.2453585 0.2611850 0.2640852 0.2753054 0.2863181
## highest: 2.4886839 2.5194637 2.5390242 2.5801038 2.6026621
## ---------------------------------------------------------------------------
## BDNF_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.3191 0.05521 0.2428 0.2563
## .25 .50 .75 .90 .95
## 0.2874 0.3166 0.3482 0.3849 0.4044
##
## lowest : 0.1151814 0.1859795 0.1941596 0.1972707 0.1981587
## highest: 0.4648484 0.4649639 0.4666728 0.4700556 0.4971599
## ---------------------------------------------------------------------------
## NR1_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 2.297 0.3927 1.726 1.867
## .25 .50 .75 .90 .95
## 2.057 2.297 2.528 2.732 2.890
##
## lowest : 1.330831 1.346827 1.403329 1.414914 1.440000
## highest: 3.147747 3.164237 3.174743 3.337822 3.757641
## ---------------------------------------------------------------------------
## NR2A_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 3.844 1.047 2.510 2.726
## .25 .50 .75 .90 .95
## 3.156 3.761 4.440 5.119 5.537
##
## lowest : 1.737540 1.794716 1.814940 1.836636 1.860128
## highest: 6.492660 6.766150 7.031754 7.120759 8.482553
## ---------------------------------------------------------------------------
## pAKT_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1076 1 0.2332 0.04534 0.1748 0.1853
## .25 .50 .75 .90 .95
## 0.2058 0.2312 0.2573 0.2841 0.3016
##
## lowest : 0.06323601 0.07548701 0.07946370 0.08635097 0.12099984
## highest: 0.35997415 0.36971299 0.39228530 0.43462381 0.53905013
## ---------------------------------------------------------------------------
## pBRAF_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1075 1 0.1818 0.02968 0.1413 0.1479
## .25 .50 .75 .90 .95
## 0.1646 0.1823 0.1974 0.2140 0.2257
##
## lowest : 0.06404259 0.06461039 0.06785481 0.09800699 0.10764925
## highest: 0.25964912 0.27419355 0.27946458 0.31295488 0.31706559
## ---------------------------------------------------------------------------
## pCAMKII_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 3.537 1.466 1.854 2.043
## .25 .50 .75 .90 .95
## 2.480 3.327 4.482 5.384 5.938
##
## lowest : 1.343998 1.369898 1.393799 1.402336 1.414933
## highest: 6.947290 6.951164 7.021525 7.104642 7.464070
## ---------------------------------------------------------------------------
## pCREB_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.2126 0.03675 0.1602 0.1713
## .25 .50 .75 .90 .95
## 0.1908 0.2106 0.2346 0.2547 0.2672
##
## lowest : 0.1128118 0.1274008 0.1274832 0.1327651 0.1348824
## highest: 0.3024105 0.3045564 0.3051907 0.3057732 0.3062472
## ---------------------------------------------------------------------------
## pELK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 1.429 0.4031 0.9166 1.0280
## .25 .50 .75 .90 .95
## 1.2037 1.3558 1.5613 1.7776 2.0011
##
## lowest : 0.4290323 0.7207668 0.7342256 0.7550437 0.7701149
## highest: 4.4973374 4.6878975 4.8579761 4.9426036 6.1133475
## ---------------------------------------------------------------------------
## pERK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.5459 0.302 0.2650 0.2955
## .25 .50 .75 .90 .95
## 0.3374 0.4436 0.6633 0.8854 1.0370
##
## lowest : 0.1491552 0.1609499 0.1795298 0.1897619 0.1912208
## highest: 2.8068632 2.9163966 3.0645281 3.2892735 3.5666854
## ---------------------------------------------------------------------------
## pJNK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1076 1 0.3135 0.057 0.2230 0.2455
## .25 .50 .75 .90 .95
## 0.2812 0.3213 0.3487 0.3726 0.3899
##
## lowest : 0.05211039 0.05565414 0.05706344 0.07934163 0.08177522
## highest: 0.42277359 0.42427943 0.44455213 0.46906841 0.49342586
## ---------------------------------------------------------------------------
## PKCA_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.3179 0.05912 0.2352 0.2542
## .25 .50 .75 .90 .95
## 0.2818 0.3130 0.3523 0.3915 0.4100
##
## lowest : 0.1914307 0.1947279 0.1963828 0.1979837 0.2026424
## highest: 0.4534500 0.4603215 0.4628048 0.4638056 0.4739920
## ---------------------------------------------------------------------------
## pMEK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.275 0.05127 0.2001 0.2151
## .25 .50 .75 .90 .95
## 0.2443 0.2774 0.3034 0.3306 0.3507
##
## lowest : 0.05681818 0.06968866 0.10595160 0.14645522 0.15120782
## highest: 0.41694421 0.42176128 0.42388451 0.43827611 0.45800055
## ---------------------------------------------------------------------------
## pNR1_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.8258 0.1322 0.6448 0.6843
## .25 .50 .75 .90 .95
## 0.7435 0.8211 0.8985 0.9795 1.0243
##
## lowest : 0.5001597 0.5034538 0.5167739 0.5186909 0.5208758
## highest: 1.1395349 1.1609035 1.1852789 1.2447215 1.4081688
## ---------------------------------------------------------------------------
## pNR2A_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.7269 0.2125 0.4375 0.4903
## .25 .50 .75 .90 .95
## 0.5903 0.7196 0.8486 0.9796 1.0567
##
## lowest : 0.2812848 0.3058339 0.3071190 0.3127208 0.3175686
## highest: 1.2912586 1.2942517 1.3393715 1.3599832 1.4127502
## ---------------------------------------------------------------------------
## pNR2B_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 1.562 0.303 1.129 1.225
## .25 .50 .75 .90 .95
## 1.381 1.564 1.749 1.912 1.987
##
## lowest : 0.3016086 0.3144841 0.3374778 0.8119808 0.8282731
## highest: 2.2385174 2.2455228 2.2956872 2.3752286 2.7239654
## ---------------------------------------------------------------------------
## pPKCAB_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 1.525 0.5381 0.9056 1.0072
## .25 .50 .75 .90 .95
## 1.1683 1.3657 1.8859 2.2623 2.4003
##
## lowest : 0.5678405 0.5989071 0.6461791 0.6560623 0.6581806
## highest: 2.8253239 2.8267220 2.8409339 2.9324093 3.0613871
## ---------------------------------------------------------------------------
## pRSK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.4428 0.07282 0.3380 0.3657
## .25 .50 .75 .90 .95
## 0.4041 0.4406 0.4821 0.5290 0.5535
##
## lowest : 0.09594156 0.09743507 0.10725965 0.15978078 0.15994936
## highest: 0.62896711 0.63233602 0.63261819 0.63360740 0.65096181
## ---------------------------------------------------------------------------
## AKT_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.6822 0.1412 0.4819 0.5394
## .25 .50 .75 .90 .95
## 0.5968 0.6825 0.7597 0.8476 0.8929
##
## lowest : 0.06442119 0.06444805 0.07114051 0.31888227 0.33450120
## highest: 1.03657523 1.04282188 1.05856833 1.06077147 1.18217474
## ---------------------------------------------------------------------------
## BRAF_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.3785 0.1749 0.2144 0.2337
## .25 .50 .75 .90 .95
## 0.2643 0.3267 0.4136 0.5841 0.6622
##
## lowest : 0.1438936 0.1462868 0.1528569 0.1545428 0.1578467
## highest: 1.9403759 1.9954571 1.9992518 2.0888322 2.1334157
## ---------------------------------------------------------------------------
## CAMKII_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.3634 0.05789 0.2801 0.2994
## .25 .50 .75 .90 .95
## 0.3309 0.3603 0.3939 0.4303 0.4540
##
## lowest : 0.2129595 0.2222004 0.2324005 0.2352753 0.2356193
## highest: 0.5319950 0.5513820 0.5635500 0.5740181 0.5862445
## ---------------------------------------------------------------------------
## CREB_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1073 1 0.1805 0.02919 0.1418 0.1480
## .25 .50 .75 .90 .95
## 0.1618 0.1796 0.1957 0.2128 0.2249
##
## lowest : 0.1136364 0.1155045 0.1178280 0.1198825 0.1227477
## highest: 0.2752204 0.2752387 0.2754036 0.2867540 0.3195582
## ---------------------------------------------------------------------------
## ELK_N
## n missing distinct Info Mean Gmd .05 .10
## 1062 18 1062 1 1.173 0.3547 0.7749 0.8478
## .25 .50 .75 .90 .95
## 0.9444 1.0962 1.3236 1.6568 1.8402
##
## lowest : 0.4976950 0.5740672 0.5978463 0.6022750 0.6045541
## highest: 2.5244799 2.5713606 2.5782326 2.6580195 2.8029483
## ---------------------------------------------------------------------------
## ERK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 2.474 0.7295 1.550 1.684
## .25 .50 .75 .90 .95
## 1.992 2.401 2.873 3.422 3.640
##
## lowest : 1.131796 1.169514 1.174350 1.209362 1.238674
## highest: 4.447951 4.643823 4.804018 4.892803 5.198404
## ---------------------------------------------------------------------------
## GSK3B_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 1.173 0.2605 0.8102 0.8866
## .25 .50 .75 .90 .95
## 1.0231 1.1598 1.3097 1.4441 1.5159
##
## lowest : 0.1511243 0.5373899 0.5748255 0.5902113 0.6048699
## highest: 2.3230113 2.3683565 2.4377254 2.4401444 2.4757512
## ---------------------------------------------------------------------------
## JNK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.2416 0.03658 0.1868 0.2005
## .25 .50 .75 .90 .95
## 0.2204 0.2449 0.2633 0.2804 0.2881
##
## lowest : 0.04629779 0.05584416 0.06245912 0.06457058 0.06597808
## highest: 0.32198500 0.32400403 0.33420094 0.36140483 0.38719068
## ---------------------------------------------------------------------------
## MEK_N
## n missing distinct Info Mean Gmd .05 .10
## 1073 7 1072 1 0.2728 0.04623 0.2004 0.2180
## .25 .50 .75 .90 .95
## 0.2471 0.2734 0.3008 0.3241 0.3379
##
## lowest : 0.1472015 0.1501014 0.1546003 0.1632522 0.1652432
## highest: 0.3828926 0.3833505 0.3907032 0.3966399 0.4154079
## ---------------------------------------------------------------------------
## TRKA_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1075 1 0.6932 0.1356 0.4845 0.5278
## .25 .50 .75 .90 .95
## 0.6171 0.7050 0.7742 0.8417 0.8829
##
## lowest : 0.1987434 0.2104558 0.2220249 0.3525928 0.3624257
## highest: 0.9567257 0.9598168 0.9795088 0.9800098 1.0016229
## ---------------------------------------------------------------------------
## RSK_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1074 1 0.1684 0.03118 0.1254 0.1335
## .25 .50 .75 .90 .95
## 0.1496 0.1667 0.1845 0.2046 0.2185
##
## lowest : 0.1073944 0.1117441 0.1128538 0.1136638 0.1137255
## highest: 0.2699050 0.2758370 0.2801814 0.2814722 0.3051360
## ---------------------------------------------------------------------------
## APP_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.4048 0.06861 0.3080 0.3281
## .25 .50 .75 .90 .95
## 0.3663 0.4020 0.4419 0.4880 0.5111
##
## lowest : 0.2355954 0.2423562 0.2478524 0.2553626 0.2560951
## highest: 0.5735264 0.5900151 0.5906473 0.6160290 0.6326627
## ---------------------------------------------------------------------------
## Bcatenin_N
## n missing distinct Info Mean Gmd .05 .10
## 1062 18 1062 1 2.147 0.4936 1.482 1.597
## .25 .50 .75 .90 .95
## 1.827 2.115 2.424 2.734 2.948
##
## lowest : 1.134886 1.173540 1.179566 1.185019 1.202827
## highest: 3.253195 3.266385 3.291441 3.299918 3.680552
## ---------------------------------------------------------------------------
## SOD1_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.5426 0.2961 0.2705 0.2866
## .25 .50 .75 .90 .95
## 0.3196 0.4441 0.6958 0.9358 1.0757
##
## lowest : 0.2171202 0.2303602 0.2339398 0.2339517 0.2398032
## highest: 1.5919598 1.6105212 1.7144296 1.8620317 1.8728985
## ---------------------------------------------------------------------------
## MTOR_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.4525 0.07304 0.3498 0.3731
## .25 .50 .75 .90 .95
## 0.4104 0.4525 0.4880 0.5392 0.5663
##
## lowest : 0.2011434 0.2085514 0.2169581 0.2400371 0.2426975
## highest: 0.6277325 0.6457956 0.6468241 0.6515966 0.6767480
## ---------------------------------------------------------------------------
## P38_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1075 1 0.4153 0.09833 0.2890 0.3077
## .25 .50 .75 .90 .95
## 0.3520 0.4078 0.4663 0.5235 0.5804
##
## lowest : 0.2278804 0.2360062 0.2365752 0.2375201 0.2423227
## highest: 0.7219355 0.7530089 0.7656140 0.7674141 0.9332563
## ---------------------------------------------------------------------------
## pMTOR_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.759 0.1331 0.5829 0.6208
## .25 .50 .75 .90 .95
## 0.6835 0.7608 0.8415 0.9067 0.9463
##
## lowest : 0.1665787 0.1811408 0.1918159 0.2088957 0.2132967
## highest: 1.0525444 1.0582724 1.0780362 1.1053106 1.1248834
## ---------------------------------------------------------------------------
## DSCR1_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.5852 0.1069 0.4477 0.4796
## .25 .50 .75 .90 .95
## 0.5309 0.5767 0.6344 0.7103 0.7671
##
## lowest : 0.1553210 0.1765216 0.1781864 0.1855825 0.1978178
## highest: 0.8884452 0.8928134 0.8947796 0.8949178 0.9164295
## ---------------------------------------------------------------------------
## AMPKA_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1075 1 0.3684 0.0687 0.2847 0.3001
## .25 .50 .75 .90 .95
## 0.3266 0.3585 0.4008 0.4555 0.4838
##
## lowest : 0.2264087 0.2326615 0.2346786 0.2366105 0.2367573
## highest: 0.5641799 0.5707633 0.5984155 0.6149626 0.7008385
## ---------------------------------------------------------------------------
## NR2B_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.5653 0.09556 0.4340 0.4686
## .25 .50 .75 .90 .95
## 0.5149 0.5635 0.6145 0.6794 0.7115
##
## lowest : 0.1847845 0.2039386 0.2042758 0.2066598 0.2210465
## highest: 0.8100804 0.8264809 0.8843568 0.9298017 0.9720198
## ---------------------------------------------------------------------------
## pNUMB_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.3571 0.06999 0.2694 0.2823
## .25 .50 .75 .90 .95
## 0.3128 0.3474 0.3927 0.4457 0.4760
##
## lowest : 0.1855976 0.2116105 0.2174155 0.2222222 0.2247272
## highest: 0.5624407 0.5722211 0.5814696 0.5956775 0.6310522
## ---------------------------------------------------------------------------
## RAPTOR_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1077 1 0.3158 0.05997 0.2425 0.2559
## .25 .50 .75 .90 .95
## 0.2761 0.3049 0.3473 0.3938 0.4204
##
## lowest : 0.1948245 0.2070140 0.2159514 0.2184738 0.2208304
## highest: 0.5016442 0.5042681 0.5089263 0.5089783 0.5266814
## ---------------------------------------------------------------------------
## TIAM1_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1075 1 0.4186 0.07391 0.3274 0.3422
## .25 .50 .75 .90 .95
## 0.3720 0.4072 0.4560 0.5094 0.5456
##
## lowest : 0.2377771 0.2592782 0.2704333 0.2800111 0.2832882
## highest: 0.6554954 0.6608612 0.6971869 0.7146080 0.7221216
## ---------------------------------------------------------------------------
## pP70S6_N
## n missing distinct Info Mean Gmd .05 .10
## 1077 3 1076 1 0.3945 0.171 0.1728 0.1998
## .25 .50 .75 .90 .95
## 0.2811 0.3777 0.4811 0.5851 0.6770
##
## lowest : 0.1311198 0.1345354 0.1355658 0.1398453 0.1403043
## highest: 1.0100939 1.0482345 1.0854010 1.0872595 1.1291715
## ---------------------------------------------------------------------------
## NUMB_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.1811 0.03246 0.1400 0.1473
## .25 .50 .75 .90 .95
## 0.1593 0.1782 0.1972 0.2198 0.2354
##
## lowest : 0.1179985 0.1184910 0.1209077 0.1225590 0.1227794
## highest: 0.2805684 0.2809826 0.3052795 0.3064755 0.3165753
## ---------------------------------------------------------------------------
## P70S6_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.9431 0.1932 0.6826 0.7293
## .25 .50 .75 .90 .95
## 0.8267 0.9313 1.0451 1.1842 1.2493
##
## lowest : 0.3441198 0.3592496 0.4711711 0.5831042 0.5895028
## highest: 1.4348178 1.4401663 1.5755662 1.6075684 1.6799532
## ---------------------------------------------------------------------------
## pGSK3B_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.1612 0.02121 0.1321 0.1371
## .25 .50 .75 .90 .95
## 0.1493 0.1602 0.1717 0.1860 0.1950
##
## lowest : 0.09997585 0.10924965 0.11503639 0.11510337 0.11624730
## highest: 0.22691722 0.22779860 0.24349387 0.24430931 0.25321009
## ---------------------------------------------------------------------------
## pPKCG_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 1.707 0.6596 0.7864 0.9501
## .25 .50 .75 .90 .95
## 1.2968 1.6646 2.1130 2.5103 2.6893
##
## lowest : 0.5987666 0.6005737 0.6031650 0.6058048 0.6126829
## highest: 3.1968893 3.1993032 3.2440019 3.2715417 3.3819763
## ---------------------------------------------------------------------------
## CDK5_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.2924 0.0386 0.2330 0.2452
## .25 .50 .75 .90 .95
## 0.2726 0.2938 0.3125 0.3321 0.3501
##
## lowest : 0.1811570 0.1975619 0.2030083 0.2049701 0.2050037
## highest: 0.3850295 0.3953077 0.4062779 0.4141955 0.8174018
## ---------------------------------------------------------------------------
## S6_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.4292 0.1562 0.2426 0.2733
## .25 .50 .75 .90 .95
## 0.3167 0.4010 0.5349 0.6240 0.6699
##
## lowest : 0.1302063 0.1455924 0.1549423 0.1622186 0.1701055
## highest: 0.7806789 0.7870346 0.7877019 0.8073863 0.8226108
## ---------------------------------------------------------------------------
## ADARB1_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 1.197 0.3942 0.7433 0.8165
## .25 .50 .75 .90 .95
## 0.9305 1.1283 1.3802 1.6893 1.9296
##
## lowest : 0.5291078 0.5389338 0.5489707 0.5583358 0.5586451
## highest: 2.3835378 2.4026476 2.4047619 2.4675499 2.5398896
## ---------------------------------------------------------------------------
## AcetylH3K9_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.2165 0.168 0.07039 0.07959
## .25 .50 .75 .90 .95
## 0.10357 0.15042 0.26965 0.37512 0.58872
##
## lowest : 0.05252842 0.05265137 0.05385404 0.05401287 0.05458515
## highest: 1.11472275 1.12909633 1.15685651 1.34237767 1.45938685
## ---------------------------------------------------------------------------
## RRP1_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.1666 0.02866 0.1338 0.1401
## .25 .50 .75 .90 .95
## 0.1490 0.1621 0.1774 0.1968 0.2116
##
## lowest : -0.06200787 0.11689974 0.11897251 0.11897888 0.11915638
## highest: 0.34111544 0.35211451 0.39148040 0.42855903 0.61237703
## ---------------------------------------------------------------------------
## BAX_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.1793 0.0208 0.1469 0.1551
## .25 .50 .75 .90 .95
## 0.1682 0.1807 0.1916 0.2021 0.2086
##
## lowest : 0.07232553 0.10230393 0.11203461 0.11329605 0.11648966
## highest: 0.22014093 0.22115783 0.22282524 0.22378610 0.24114105
## ---------------------------------------------------------------------------
## ARC_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.1215 0.01628 0.09864 0.10309
## .25 .50 .75 .90 .95
## 0.11084 0.12163 0.13196 0.14017 0.14487
##
## lowest : 0.06725429 0.07544495 0.07680510 0.08379825 0.08439776
## highest: 0.15402073 0.15411897 0.15687826 0.15736561 0.15874781
## ---------------------------------------------------------------------------
## ERBB4_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1079 1 0.1565 0.01675 0.1325 0.1389
## .25 .50 .75 .90 .95
## 0.1470 0.1564 0.1654 0.1757 0.1823
##
## lowest : 0.1002173 0.1058711 0.1079084 0.1083058 0.1103268
## highest: 0.1970899 0.1974948 0.1991830 0.2002162 0.2086975
## ---------------------------------------------------------------------------
## nNOS_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1079 1 0.1813 0.02796 0.1372 0.1475
## .25 .50 .75 .90 .95
## 0.1665 0.1827 0.1986 0.2115 0.2194
##
## lowest : 0.09973436 0.10631596 0.10702494 0.11255309 0.11293103
## highest: 0.24577523 0.25297377 0.25599706 0.26029255 0.26073863
## ---------------------------------------------------------------------------
## Tau_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.2105 0.0691 0.1415 0.1485
## .25 .50 .75 .90 .95
## 0.1680 0.1886 0.2339 0.3040 0.3493
##
## lowest : 0.09623279 0.09969325 0.10015519 0.11597875 0.11728308
## highest: 0.52240543 0.53088381 0.56712673 0.58043616 0.60276806
## ---------------------------------------------------------------------------
## GFAP_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1079 1 0.1209 0.01429 0.1000 0.1046
## .25 .50 .75 .90 .95
## 0.1128 0.1205 0.1277 0.1365 0.1421
##
## lowest : 0.08611418 0.08766644 0.08783043 0.08821604 0.08853316
## highest: 0.17530140 0.17558779 0.17828298 0.19703050 0.21362064
## ---------------------------------------------------------------------------
## GluR3_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.2219 0.03932 0.1743 0.1827
## .25 .50 .75 .90 .95
## 0.1957 0.2169 0.2460 0.2701 0.2844
##
## lowest : 0.1113821 0.1200208 0.1244546 0.1268955 0.1282082
## highest: 0.3148309 0.3182612 0.3203194 0.3203515 0.3310159
## ---------------------------------------------------------------------------
## GluR4_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1079 1 0.1266 0.02613 0.09306 0.09931
## .25 .50 .75 .90 .95
## 0.10889 0.12355 0.14195 0.15542 0.16365
##
## lowest : 0.07257968 0.07329147 0.07336244 0.08034450 0.08215206
## highest: 0.18831047 0.18891397 0.25719720 0.42069863 0.53700410
## ---------------------------------------------------------------------------
## IL1B_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.5273 0.09175 0.3920 0.4222
## .25 .50 .75 .90 .95
## 0.4756 0.5267 0.5770 0.6276 0.6629
##
## lowest : 0.2840013 0.2962466 0.3045522 0.3130310 0.3296246
## highest: 0.7703358 0.7761248 0.7831150 0.8423141 0.8897351
## ---------------------------------------------------------------------------
## P3525_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.2913 0.03395 0.2434 0.2542
## .25 .50 .75 .90 .95
## 0.2701 0.2906 0.3116 0.3295 0.3415
##
## lowest : 0.2074378 0.2129542 0.2155300 0.2157518 0.2166631
## highest: 0.3625067 0.3651188 0.3655389 0.3667681 0.4437350
## ---------------------------------------------------------------------------
## pCASP9_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 1.548 0.2774 1.191 1.248
## .25 .50 .75 .90 .95
## 1.376 1.523 1.713 1.880 1.971
##
## lowest : 0.8531756 0.8974752 0.9157394 0.9458182 0.9664215
## highest: 2.4031145 2.4548915 2.5101213 2.5314108 2.5862159
## ---------------------------------------------------------------------------
## PSD95_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 2.235 0.2844 1.799 1.910
## .25 .50 .75 .90 .95
## 2.079 2.242 2.420 2.543 2.603
##
## lowest : 1.206098 1.321953 1.331925 1.345682 1.402583
## highest: 2.837367 2.844401 2.851444 2.851852 2.877873
## ---------------------------------------------------------------------------
## SNCA_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1079 1 0.1598 0.02688 0.1245 0.1306
## .25 .50 .75 .90 .95
## 0.1428 0.1575 0.1733 0.1920 0.2056
##
## lowest : 0.1012332 0.1081962 0.1082343 0.1092731 0.1094067
## highest: 0.2352558 0.2366942 0.2429692 0.2512195 0.2576159
## ---------------------------------------------------------------------------
## Ubiquitin_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 1.239 0.1967 0.9608 1.0097
## .25 .50 .75 .90 .95
## 1.1163 1.2366 1.3631 1.4646 1.5112
##
## lowest : 0.7506641 0.8118216 0.8307528 0.8308095 0.8321190
## highest: 1.7328926 1.7407243 1.8393543 1.8936446 1.8972023
## ---------------------------------------------------------------------------
## pGSK3B_Tyr216_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.8488 0.1062 0.6839 0.7196
## .25 .50 .75 .90 .95
## 0.7937 0.8499 0.9162 0.9662 0.9935
##
## lowest : 0.5773968 0.5819287 0.5977821 0.6051203 0.6062151
## highest: 1.1164285 1.1198982 1.1463778 1.1508555 1.2045981
## ---------------------------------------------------------------------------
## SHH_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.2267 0.03151 0.1867 0.1938
## .25 .50 .75 .90 .95
## 0.2064 0.2240 0.2417 0.2609 0.2803
##
## lowest : 0.1558693 0.1626419 0.1655822 0.1698028 0.1725882
## highest: 0.3391304 0.3443132 0.3522055 0.3560009 0.3582888
## ---------------------------------------------------------------------------
## BAD_N
## n missing distinct Info Mean Gmd .05 .10
## 867 213 866 1 0.1579 0.03261 0.1189 0.1248
## .25 .50 .75 .90 .95
## 0.1364 0.1523 0.1740 0.1984 0.2136
##
## lowest : 0.08830462 0.09236149 0.09768409 0.10076730 0.10223765
## highest: 0.24734918 0.25251201 0.25524157 0.26619343 0.28201635
## ---------------------------------------------------------------------------
## BCL2_N
## n missing distinct Info Mean Gmd .05 .10
## 795 285 795 1 0.1348 0.02942 0.1005 0.1058
## .25 .50 .75 .90 .95
## 0.1156 0.1295 0.1482 0.1708 0.1859
##
## lowest : 0.08065685 0.08447381 0.08950891 0.09212676 0.09280994
## highest: 0.23393124 0.23802129 0.24309952 0.26094571 0.26150572
## ---------------------------------------------------------------------------
## pS6_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 0.1215 0.01628 0.09864 0.10309
## .25 .50 .75 .90 .95
## 0.11084 0.12163 0.13196 0.14017 0.14487
##
## lowest : 0.06725429 0.07544495 0.07680510 0.08379825 0.08439776
## highest: 0.15402073 0.15411897 0.15687826 0.15736561 0.15874781
## ---------------------------------------------------------------------------
## pCFOS_N
## n missing distinct Info Mean Gmd .05 .10
## 1005 75 1005 1 0.1311 0.02587 0.1009 0.1051
## .25 .50 .75 .90 .95
## 0.1135 0.1265 0.1437 0.1635 0.1747
##
## lowest : 0.08541915 0.09067696 0.09068140 0.09109544 0.09176119
## highest: 0.24039102 0.24271201 0.24535679 0.24768212 0.25652893
## ---------------------------------------------------------------------------
## SYP_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1079 1 0.4461 0.07471 0.3365 0.3622
## .25 .50 .75 .90 .95
## 0.3981 0.4485 0.4908 0.5265 0.5530
##
## lowest : 0.2586258 0.2601463 0.2733639 0.2776650 0.2809251
## highest: 0.6059889 0.6362787 0.7221362 0.7406525 0.7595884
## ---------------------------------------------------------------------------
## H3AcK18_N
## n missing distinct Info Mean Gmd .05 .10
## 900 180 900 1 0.1696 0.06124 0.1027 0.1105
## .25 .50 .75 .90 .95
## 0.1258 0.1582 0.1979 0.2366 0.2836
##
## lowest : 0.07969090 0.08236342 0.08530777 0.08678524 0.08685403
## highest: 0.45084043 0.45355352 0.45509434 0.46016787 0.47976327
## ---------------------------------------------------------------------------
## EGR1_N
## n missing distinct Info Mean Gmd .05 .10
## 870 210 870 1 0.1831 0.04362 0.1332 0.1402
## .25 .50 .75 .90 .95
## 0.1551 0.1749 0.2045 0.2363 0.2597
##
## lowest : 0.1055372 0.1061346 0.1070948 0.1074335 0.1081293
## highest: 0.3230668 0.3261426 0.3371865 0.3408836 0.3606921
## ---------------------------------------------------------------------------
## H3MeK4_N
## n missing distinct Info Mean Gmd .05 .10
## 810 270 810 1 0.2054 0.06065 0.1345 0.1453
## .25 .50 .75 .90 .95
## 0.1651 0.1940 0.2352 0.2833 0.3144
##
## lowest : 0.1017870 0.1049287 0.1126895 0.1138108 0.1151487
## highest: 0.3820815 0.3833175 0.3879541 0.3958202 0.4139027
## ---------------------------------------------------------------------------
## CaNA_N
## n missing distinct Info Mean Gmd .05 .10
## 1080 0 1080 1 1.338 0.3632 0.8870 0.9511
## .25 .50 .75 .90 .95
## 1.0814 1.3174 1.5858 1.7695 1.8668
##
## lowest : 0.5864788 0.6275812 0.6433178 0.6433882 0.6614039
## highest: 2.0744911 2.0770839 2.1038759 2.1155551 2.1297911
## ---------------------------------------------------------------------------
gene<-mice_dat
b).Find the missing value, then delete variables with more than 10% missing value and using mean imputation to deal with others.
N_NA<-sapply(mice_dat[,c(2:78)], function(x) sum(is.na(x)))
N_NA<-N_NA/dim(mice_dat)[1]
#remove variables with more than 10% missing values
gene<-gene[,!names(gene)%in%names(N_NA)[which(N_NA>=0.1)]]
#a simple function to perform mean imputation
NA2mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
gene[,c(2:73)]<-lapply(gene[,c(2:73)], NA2mean)
2.a). Perform PCA on imputed expression measurements
pca = prcomp(gene[,c(2:73)], center = TRUE)
#explained variance
expl.var <- pca$sdev^2/sum(pca$sdev^2)*100 # percent explained variance
cum.var<-cumsum(expl.var)
#Plot PC in ggplot2
dat<-data.frame(expl.var, cum.var)
dat$PC<-1:72
p0<-ggplot(dat, aes(x = PC, y = expl.var)) + geom_point()+geom_line()
p0<-p0+xlab("Principal Component")
p0<-p0+ylab("Cumulative Proportion of variance")
p0
p<-ggplot(dat, aes(x = PC, y = cum.var)) + geom_point()+geom_line()
p<-p+xlab("Principal Component")
p<-p+ylab("Cumulative Proportion of variance")
p
According to the scree plot, with around 19 PCs, the cumulative proportional variance reach to nearly 99%.
b). Scale the data and repeat the above process.
pca3 = prcomp(gene[,c(2:73)], center = TRUE, scale. = TRUE)
#explained variance
expl.var <- pca3$sdev^2/sum(pca3$sdev^2)*100 #percent explained variance
cum.var<-cumsum(expl.var)
#Plot PC in ggplot2
dat<-data.frame(expl.var, cum.var)
dat$PC<-1:72
p<-ggplot(dat, aes(x = PC, y = expl.var)) + geom_point()+geom_line()
p<-p+xlab("Principal Component")
p<-p+ylab("Proportion of variance")
p
#variance for each gene expression
t<-apply(gene[,c(2:73)], 2, var)
dat0<-data.frame(gene = names(gene[,c(2:73)]), variance = t)
ggplot(data = dat0, aes(x = gene, y = variance))+
geom_point()+
theme_classic()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3))
After scaling the data, the explained proportion of variance in the first several PCs were significantly reduced compare to the one without scaling. The possible reason is that there are several outliers of genes with large variance, such as pCAMKII_N.
c).
According to the plot, overall, there is no significant good separation by PC1 or PC2 in recapturing homogeneous groups of behaviors, treatment, Genotype, and class. Most of the groups have overlapping observations in the PC space except for behavior. PC2 may have some separation in behavior.
d). i. Fit a logistic regression using PC scores
temp<-as.data.frame(pca$x)
temp$Genotype<-ifelse(gene$Genotype == "Control", 0, 1)
temp$Genotype<-as.factor(temp$Genotype)
fit <- glm(Genotype~., data = temp, family=binomial(link='logit'))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit)
##
## Call:
## glm(formula = Genotype ~ ., family = binomial(link = "logit"),
## data = temp)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.511e-04 -2.100e-08 -2.100e-08 2.100e-08 1.588e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.897e-01 1.543e+04 0.000 1.000
## PC1 -1.562e+00 7.093e+03 0.000 1.000
## PC2 -1.269e+01 1.279e+04 -0.001 0.999
## PC3 1.360e+02 3.351e+04 0.004 0.997
## PC4 7.691e+01 1.541e+04 0.005 0.996
## PC5 -1.502e+02 3.425e+04 -0.004 0.996
## PC6 -5.153e+01 4.216e+04 -0.001 0.999
## PC7 -4.360e+01 4.299e+04 -0.001 0.999
## PC8 -1.226e+02 1.139e+05 -0.001 0.999
## PC9 -1.736e+02 7.148e+04 -0.002 0.998
## PC10 -3.275e+02 1.262e+05 -0.003 0.998
## PC11 -2.359e+02 7.925e+04 -0.003 0.998
## PC12 1.227e+01 3.661e+05 0.000 1.000
## PC13 -3.005e+02 2.224e+05 -0.001 0.999
## PC14 -6.848e+01 1.650e+05 0.000 1.000
## PC15 4.152e+02 1.586e+05 0.003 0.998
## PC16 5.958e+02 2.142e+05 0.003 0.998
## PC17 -9.404e+02 1.591e+05 -0.006 0.995
## PC18 1.566e+03 1.710e+05 0.009 0.993
## PC19 -2.218e+02 1.845e+05 -0.001 0.999
## PC20 -5.634e+02 2.674e+05 -0.002 0.998
## PC21 -1.144e+02 1.855e+05 -0.001 1.000
## PC22 -1.732e+03 2.769e+05 -0.006 0.995
## PC23 -7.662e+02 3.274e+05 -0.002 0.998
## PC24 5.095e+01 3.684e+05 0.000 1.000
## PC25 -5.576e+02 1.314e+05 -0.004 0.997
## PC26 -6.791e+02 1.594e+05 -0.004 0.997
## PC27 -1.925e+03 3.401e+05 -0.006 0.995
## PC28 -3.653e+02 5.557e+05 -0.001 0.999
## PC29 -2.362e+03 6.380e+05 -0.004 0.997
## PC30 3.653e+02 2.749e+05 0.001 0.999
## PC31 -8.847e+02 7.635e+05 -0.001 0.999
## PC32 -1.008e+03 8.376e+05 -0.001 0.999
## PC33 1.410e+03 3.782e+05 0.004 0.997
## PC34 2.053e+03 4.193e+05 0.005 0.996
## PC35 -1.851e+03 1.187e+06 -0.002 0.999
## PC36 1.129e+03 9.199e+05 0.001 0.999
## PC37 2.644e+02 1.115e+06 0.000 1.000
## PC38 1.126e+03 3.975e+05 0.003 0.998
## PC39 3.968e+02 5.356e+05 0.001 0.999
## PC40 -2.572e+02 2.877e+05 -0.001 0.999
## PC41 -2.361e+03 1.105e+06 -0.002 0.998
## PC42 1.796e+03 6.255e+05 0.003 0.998
## PC43 -9.353e+02 6.501e+05 -0.001 0.999
## PC44 1.709e+03 7.840e+05 0.002 0.998
## PC45 -1.904e+03 8.120e+05 -0.002 0.998
## PC46 1.167e+03 1.315e+06 0.001 0.999
## PC47 -1.189e+03 8.850e+05 -0.001 0.999
## PC48 2.472e+03 5.507e+05 0.004 0.996
## PC49 -1.104e+03 6.358e+05 -0.002 0.999
## PC50 -1.643e+03 8.151e+05 -0.002 0.998
## PC51 -1.844e+03 1.223e+06 -0.002 0.999
## PC52 1.172e+03 1.557e+06 0.001 0.999
## PC53 2.261e+03 1.829e+06 0.001 0.999
## PC54 2.202e+02 9.896e+05 0.000 1.000
## PC55 1.963e+03 4.718e+05 0.004 0.997
## PC56 1.605e+03 1.430e+06 0.001 0.999
## PC57 2.142e+03 1.138e+06 0.002 0.998
## PC58 -1.695e+03 9.246e+05 -0.002 0.999
## PC59 3.720e+03 1.845e+06 0.002 0.998
## PC60 -2.772e+03 9.049e+05 -0.003 0.998
## PC61 -5.348e+02 1.433e+06 0.000 1.000
## PC62 3.329e+02 1.277e+06 0.000 1.000
## PC63 2.599e+03 2.416e+06 0.001 0.999
## PC64 9.139e+02 1.128e+06 0.001 0.999
## PC65 1.213e+03 4.147e+06 0.000 1.000
## PC66 7.976e+01 9.888e+05 0.000 1.000
## PC67 6.641e+02 1.570e+06 0.000 1.000
## PC68 -2.177e+03 1.751e+06 -0.001 0.999
## PC69 4.974e+02 3.631e+06 0.000 1.000
## PC70 6.282e+03 1.891e+06 0.003 0.997
## PC71 4.059e+02 2.078e+06 0.000 1.000
## PC72 9.403e+18 3.743e+21 0.003 0.998
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.4939e+03 on 1079 degrees of freedom
## Residual deviance: 4.4879e-07 on 1007 degrees of freedom
## AIC: 146
##
## Number of Fisher Scoring iterations: 25
Because I have 72 varibles in my model, I am confronted to an over-fitting problem: too many variables in the model, leading to a perfect separation of genotypes. The consequences is that model likelihood is not defined, and thus I can’t get the model to converge.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
fit2<-glm(Genotype~., data = temp[,c(1:19, 73)], family=binomial(link='logit'))
summary(fit2)
##
## Call:
## glm(formula = Genotype ~ ., family = binomial(link = "logit"),
## data = temp[, c(1:19, 73)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7811 -0.5693 -0.1208 0.5530 3.2466
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.238256 0.093173 -2.557 0.010554 *
## PC1 0.190359 0.060989 3.121 0.001801 **
## PC2 -0.486991 0.081643 -5.965 2.45e-09 ***
## PC3 1.162662 0.141285 8.229 < 2e-16 ***
## PC4 0.093566 0.160263 0.584 0.559337
## PC5 -2.387215 0.278931 -8.558 < 2e-16 ***
## PC6 -1.204505 0.289795 -4.156 3.23e-05 ***
## PC7 -1.116698 0.345625 -3.231 0.001234 **
## PC8 -1.529011 0.368779 -4.146 3.38e-05 ***
## PC9 -2.671859 0.417346 -6.402 1.53e-10 ***
## PC10 -3.190442 0.506785 -6.295 3.06e-10 ***
## PC11 -1.030280 0.607791 -1.695 0.090052 .
## PC12 -0.071460 0.508266 -0.141 0.888190
## PC13 -2.867598 0.690202 -4.155 3.26e-05 ***
## PC14 0.005547 0.611633 0.009 0.992763
## PC15 5.959579 0.755457 7.889 3.05e-15 ***
## PC16 3.549099 0.836709 4.242 2.22e-05 ***
## PC17 -9.819356 1.118729 -8.777 < 2e-16 ***
## PC18 13.452681 1.129721 11.908 < 2e-16 ***
## PC19 -3.629204 1.003107 -3.618 0.000297 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1493.86 on 1079 degrees of freedom
## Residual deviance: 811.68 on 1060 degrees of freedom
## AIC: 851.68
##
## Number of Fisher Scoring iterations: 6
gene$Genotype<-ifelse(gene$Genotype=="Control", 0, 1)
pcr_model <- pcr(Genotype~., data = gene[,c(2:74)], scale = TRUE, validation = "CV")
validationplot(pcr_model)
We found using the meaningful PCs only still have a good model of fit. By validation plot, we found RMSE for fitted model didn’t change a lot after more PCs are adding to the meaningful PCs.
clusters <- hclust(dist(gene[,2:73]))
plot(clusters, labels = FALSE)
From the cluster dendrogram, k = 2, 3 or 4 may be optimal.
pamk.best <- pamk(gene[,2:73])
cat("number of clusters estimated by optimum average silhouette width:", pamk.best$nc, "\n")
## number of clusters estimated by optimum average silhouette width: 3
plot(pam(gene[,2:73], pamk.best$nc))
By estimating silhouette width after partitioning, I found the optimal k is 3.
gene$fit.cluster<-cutree(clusters, 3)
sum0<-data.frame(Treatment = purity(gene$fit.cluster, gene$Treatment),
Genotype = purity(gene$fit.cluster, gene$Genotype),
Behavior = purity(gene$fit.cluster, gene$Behavior),
class = purity(gene$fit.cluster, gene$class))
sum0<-format(sum0, digits = 3)
row.names(sum0)<-"Purity"
htmlTable(sum0, caption = "Table 1. Purity after applying hierachial clustering with optimal k=3")
| Table 1. Purity after applying hierachial clustering with optimal k=3 | ||||
| Treatment | Genotype | Behavior | class | |
|---|---|---|---|---|
| Purity | 0.542 | 0.528 | 0.536 | 0.176 |
cor_dis<-1-cor(gene[,c(2:73)])
dis<-as.dist(cor_dis)
plot(hclust(dis),
main="Dissimilarity = 1 - Correlation", xlab = "Protein", cex = 0.5)
b). Hierarchical Clustering:
Hs <- hclust(dis, method = "single")
Ha <- hclust(dis, method = "average")
Hc <- hclust(dis, method = "complete")
plot(Hs, cex = 0.5)
plot(Ha, cex = 0.5)
plot(Hc, cex = 0.5)
#cluster assignment
assign<-data.frame(Single = cutree(Hs, k =4),
Average = cutree(Ha, k =4),
Complete = cutree(Hc, k =4))
table(assign$Single)
##
## 1 2 3 4
## 67 1 2 2
table(assign$Average)
##
## 1 2 3 4
## 48 17 3 4
table(assign$Complete)
##
## 1 2 3 4
## 20 25 15 12
Different methods of linkage has very different cluster assignments. For instance, when using single linkage, most of proteins were assigned to one cluster.
#standardize the data
gene_st<-as.data.frame(lapply(gene[,c(2:73)], function(x) (x-mean(x))/sd(x)))
cor_dis_st<-1-cor(gene_st)
dis_st<-as.dist(cor_dis_st)
Hc_st <- hclust(dis_st, method = "complete")
plot(Hc_st, cex = 0.5)
table(cutree(Hc_st, k = 4))
##
## 1 2 3 4
## 20 25 15 12
After standardizing the data of protein expression, the cluster assigment did not change since calculating correlation matrix is not impacted by scaling data.
Euc_matrix<-dist(t(gene[,2:73]), method = "euclidean")
dis_euc<-as.dist(Euc_matrix)
Hc_euc<-hclust(dis_euc, method = "complete")
plot(Hc_euc, cex = 0.5)
table(cutree(Hc_euc, k = 4))
##
## 1 2 3 4
## 56 4 2 10
Compare to using correlation distance as distance metrics, more proteins were assigned to one cluster after using Euclidean distance metrics.
##
## 1 2 3 4
## 9 24 26 13
| Table 2. Cluster assignment after sampling (Repeats = 5) | |||||
| Repeat:1 | Repeat:2 | Repeat:3 | Repeat:4 | Repeat:5 | |
|---|---|---|---|---|---|
| Cluster=1 | 9 | 26 | 34 | 29 | 13 |
| Cluster=2 | 24 | 17 | 20 | 18 | 25 |
| Cluster=3 | 26 | 17 | 15 | 16 | 13 |
| Cluster=4 | 13 | 12 | 3 | 9 | 21 |
After repeating sampling process before applying clustering algorithm, we found the results of cluster assignment is similar to the one without sampling.